{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Simple Linear Regression for stock using scikit-learn\n"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import math\n",
        "import seaborn as sns\n",
        "%matplotlib inline\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "import fix_yahoo_finance as yf\n",
        "yf.pdr_override()"
      ],
      "outputs": [],
      "execution_count": 1,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "stock = 'AAPL'\n",
        "start = '2016-01-01' \n",
        "end = '2018-01-01'\n",
        "data = yf.download(stock, start, end)\n",
        "data.head()"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[*********************100%***********************]  1 of 1 downloaded\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 2,
          "data": {
            "text/plain": "                  Open        High         Low       Close   Adj Close  \\\nDate                                                                     \n2016-01-04  102.610001  105.370003  102.000000  105.349998  100.274513   \n2016-01-05  105.750000  105.849998  102.410004  102.709999   97.761681   \n2016-01-06  100.559998  102.370003   99.870003  100.699997   95.848511   \n2016-01-07   98.680000  100.129997   96.430000   96.449997   91.803276   \n2016-01-08   98.550003   99.110001   96.760002   96.959999   92.288696   \n\n              Volume  \nDate                  \n2016-01-04  67649400  \n2016-01-05  55791000  \n2016-01-06  68457400  \n2016-01-07  81094400  \n2016-01-08  70798000  ",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Open</th>\n      <th>High</th>\n      <th>Low</th>\n      <th>Close</th>\n      <th>Adj Close</th>\n      <th>Volume</th>\n    </tr>\n    <tr>\n      <th>Date</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2016-01-04</th>\n      <td>102.610001</td>\n      <td>105.370003</td>\n      <td>102.000000</td>\n      <td>105.349998</td>\n      <td>100.274513</td>\n      <td>67649400</td>\n    </tr>\n    <tr>\n      <th>2016-01-05</th>\n      <td>105.750000</td>\n      <td>105.849998</td>\n      <td>102.410004</td>\n      <td>102.709999</td>\n      <td>97.761681</td>\n      <td>55791000</td>\n    </tr>\n    <tr>\n      <th>2016-01-06</th>\n      <td>100.559998</td>\n      <td>102.370003</td>\n      <td>99.870003</td>\n      <td>100.699997</td>\n      <td>95.848511</td>\n      <td>68457400</td>\n    </tr>\n    <tr>\n      <th>2016-01-07</th>\n      <td>98.680000</td>\n      <td>100.129997</td>\n      <td>96.430000</td>\n      <td>96.449997</td>\n      <td>91.803276</td>\n      <td>81094400</td>\n    </tr>\n    <tr>\n      <th>2016-01-08</th>\n      <td>98.550003</td>\n      <td>99.110001</td>\n      <td>96.760002</td>\n      <td>96.959999</td>\n      <td>92.288696</td>\n      <td>70798000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 2,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df = data.reset_index()\n",
        "df.head()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/plain": "        Date        Open        High         Low       Close   Adj Close  \\\n0 2016-01-04  102.610001  105.370003  102.000000  105.349998  100.274513   \n1 2016-01-05  105.750000  105.849998  102.410004  102.709999   97.761681   \n2 2016-01-06  100.559998  102.370003   99.870003  100.699997   95.848511   \n3 2016-01-07   98.680000  100.129997   96.430000   96.449997   91.803276   \n4 2016-01-08   98.550003   99.110001   96.760002   96.959999   92.288696   \n\n     Volume  \n0  67649400  \n1  55791000  \n2  68457400  \n3  81094400  \n4  70798000  ",
            "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Date</th>\n      <th>Open</th>\n      <th>High</th>\n      <th>Low</th>\n      <th>Close</th>\n      <th>Adj Close</th>\n      <th>Volume</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>2016-01-04</td>\n      <td>102.610001</td>\n      <td>105.370003</td>\n      <td>102.000000</td>\n      <td>105.349998</td>\n      <td>100.274513</td>\n      <td>67649400</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2016-01-05</td>\n      <td>105.750000</td>\n      <td>105.849998</td>\n      <td>102.410004</td>\n      <td>102.709999</td>\n      <td>97.761681</td>\n      <td>55791000</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2016-01-06</td>\n      <td>100.559998</td>\n      <td>102.370003</td>\n      <td>99.870003</td>\n      <td>100.699997</td>\n      <td>95.848511</td>\n      <td>68457400</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>2016-01-07</td>\n      <td>98.680000</td>\n      <td>100.129997</td>\n      <td>96.430000</td>\n      <td>96.449997</td>\n      <td>91.803276</td>\n      <td>81094400</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>2016-01-08</td>\n      <td>98.550003</td>\n      <td>99.110001</td>\n      <td>96.760002</td>\n      <td>96.959999</td>\n      <td>92.288696</td>\n      <td>70798000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
          },
          "metadata": {}
        }
      ],
      "execution_count": 3,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "X = df.drop(['Date','Close'], axis=1, inplace=True)\n",
        "y = df[['Adj Close']]"
      ],
      "outputs": [],
      "execution_count": 4,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df = df.as_matrix()"
      ],
      "outputs": [],
      "execution_count": 5,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Split X and y into X_\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,  random_state=0)"
      ],
      "outputs": [],
      "execution_count": 6,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.linear_model import LinearRegression\n",
        "\n",
        "regression_model = LinearRegression()\n",
        "regression_model.fit(X_train, y_train)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 7,
          "data": {
            "text/plain": "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
          },
          "metadata": {}
        }
      ],
      "execution_count": 7,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "intercept = regression_model.intercept_[0]\n",
        "\n",
        "print(\"The intercept for our model is {}\".format(intercept))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The intercept for our model is -1.2047109976265347e-09\n"
          ]
        }
      ],
      "execution_count": 8,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "regression_model.score(X_test, y_test)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 9,
          "data": {
            "text/plain": "1.0"
          },
          "metadata": {}
        }
      ],
      "execution_count": 9,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import mean_squared_error\n",
        "\n",
        "y_predict = regression_model.predict(X_test)\n",
        "\n",
        "regression_model_mse = mean_squared_error(y_predict, y_test)\n",
        "\n",
        "regression_model_mse"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 10,
          "data": {
            "text/plain": "2.8264629110010686e-19"
          },
          "metadata": {}
        }
      ],
      "execution_count": 10,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "math.sqrt(regression_model_mse)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 11,
          "data": {
            "text/plain": "5.316448919157475e-10"
          },
          "metadata": {}
        }
      ],
      "execution_count": 11,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# input the latest Open, High, Low, Close, Volume\n",
        "# predicts the next day price\n",
        "regression_model.predict([[167.81, 171.75, 165.19, 166.48, 37232900]])"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 12,
          "data": {
            "text/plain": "array([[166.48]])"
          },
          "metadata": {}
        }
      ],
      "execution_count": 12,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    }
  ],
  "metadata": {
    "kernel_info": {
      "name": "python3"
    },
    "kernelspec": {
      "name": "python3",
      "language": "python",
      "display_name": "Python 3"
    },
    "language_info": {
      "file_extension": ".py",
      "pygments_lexer": "ipython3",
      "version": "3.5.5",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      },
      "name": "python",
      "nbconvert_exporter": "python"
    },
    "nteract": {
      "version": "0.28.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}